{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from random import sample\n",
    "from collections import Counter, defaultdict\n",
    "\n",
    "from tqdm import tqdm\n",
    "from azure.cosmos.cosmos_client import CosmosClient\n",
    "\n",
    "import sys\n",
    "sys.path.append('../../../')  # CameraTraps repository base dir\n",
    "\n",
    "from data_management.megadb.megadb_utils import MegadbUtils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Useful queries\n",
    "\n",
    "Example queries for the MegaDB's `sequences` table. MegaDB is a NoSQL database on Azure Cosmos DB.\n",
    "\n",
    "Cosmos DB Python SDK (pre-release) documentation: https://azuresdkdocs.blob.core.windows.net/$web/python/azure-cosmos/4.0.0b5/index.html \n",
    "\n",
    "Subquery for nested objects: https://docs.microsoft.com/en-us/azure/cosmos-db/sql-query-subquery"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Connect to the Cosmos DB instance\n",
    "\n",
    "`COSMOS_ENDPOINT` and `COSMOS_KEY` need to be environment variables. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Cosmos DB client\n",
    "url = os.environ['COSMOS_ENDPOINT']\n",
    "key = os.environ['COSMOS_KEY']\n",
    "client = CosmosClient(url, credential=key)\n",
    "\n",
    "database = client.get_database_client('camera-trap')\n",
    "container_sequences = database.get_container_client('sequences')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get the `datasets` table\n",
    "\n",
    "which records the location and access levels of each dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 38.5 ms, sys: 5.2 ms, total: 43.7 ms\n",
      "Wall time: 1.08 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "megadb_utils = MegadbUtils()\n",
    "datasets = megadb_utils.get_datasets_table()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### List the public datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['caltech', 'wcs', 'nacti', 'snapshotserengeti', 'bellevue_190602']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "public_datasets = [d['dataset_name'] for d in datasets.values() if 'public' in d['access']]\n",
    "public_datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Examples\n",
    "\n",
    "In the examples, we limit the selection to a few entries using the `TOP` keyword. When using the DB to create datasets, delete the TOP keyword and arg."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### How many entries are in a dataset?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'count': 140900}\n",
      "CPU times: user 18.9 ms, sys: 2.83 ms, total: 21.7 ms\n",
      "Wall time: 403 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT COUNT(seq) as count\n",
    "FROM sequences seq\n",
    "'''\n",
    "\n",
    "partition_key = 'sulross_kitfox'  # use None if querying across all partitions\n",
    "\n",
    "if partition_key is not None:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      partition_key=partition_key)\n",
    "else:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      enable_cross_partition_query=True)\n",
    "    \n",
    "for res in result_iterable:\n",
    "    print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View some entries from a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 10\n",
      "CPU times: user 7.06 ms, sys: 2.34 ms, total: 9.4 ms\n",
      "Wall time: 143 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 10 *\n",
    "FROM sequences seq\n",
    "'''\n",
    "\n",
    "partition_key = 'sulross_kitfox'  # use None if querying across all partitions\n",
    "\n",
    "if partition_key is not None:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      partition_key=partition_key)\n",
    "else:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### What datasets are there that actually have data?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 23\n",
      "CPU times: user 452 ms, sys: 45.1 ms, total: 497 ms\n",
      "Wall time: 5min 8s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "result_iterable = container_sequences.query_items(\n",
    "    query='''\n",
    "SELECT DISTINCT seq.dataset\n",
    "FROM sequences seq\n",
    "''',\n",
    "    enable_cross_partition_query=True\n",
    ")\n",
    "\n",
    "results = [item['dataset'] for item in result_iterable]\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Image entries in a dataset with class \"empty\" - demonstrating JOIN\n",
    "Can use `partition_key` for this query without needing joins\n",
    "\n",
    "Refer to https://docs.microsoft.com/en-us/azure/cosmos-db/sql-query-join - query is constructed as follows (thanks to Chris Ritchie):\n",
    "1. Iterate through each document (seq) (query scoped to a single partition otherwise you could add an additional WHERE clause for seq.dataset = 'zsl_borneo'): `FROM sequences seq`\n",
    "2. For each document (seq) expand each child element in the images array (im): `im IN seq.images`\n",
    "3. Apply a cross product with the root of the item (seq) with each child element (im) the second step flattened\n",
    "4. Project each child element (im): `JOIN`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 20000\n",
      "CPU times: user 1.32 s, sys: 119 ms, total: 1.44 s\n",
      "Wall time: 29.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 200 im.file, seq.dataset, seq.location\n",
    "FROM sequences seq JOIN im IN seq.images \n",
    "WHERE ARRAY_LENGTH(im.class) = 1 AND ARRAY_CONTAINS(im.class, \"empty\") \n",
    "OR ARRAY_LENGTH(seq.class) = 1 AND ARRAY_CONTAINS(seq.class, \"empty\") \n",
    "'''\n",
    "\n",
    "# WHERE ARRAY_LENGTH(im.class) > 0 AND ARRAY_CONTAINS(im.class, \"empty\")\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### All bounding box annotations\n",
    "\n",
    "Including those sent to annotation and confirmed empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 10\n",
      "CPU times: user 5.56 ms, sys: 1.7 ms, total: 7.26 ms\n",
      "Wall time: 62.5 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# location should always be on the sequence level\n",
    "query = '''\n",
    "SELECT TOP 10 im.bbox, im.file, seq.dataset, seq.location\n",
    "FROM sequences seq JOIN im IN seq.images \n",
    "WHERE ARRAY_LENGTH(im.bbox) >= 0\n",
    "'''\n",
    "\n",
    "partition_key = 'caltech'  # use None if querying across all partitions\n",
    "\n",
    "if partition_key is not None:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      partition_key=partition_key)\n",
    "else:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### All sequences where at least one image has bbox annotation\n",
    "\n",
    "These are sequences with at least one image that was sent for bbox annotation, which could have come back confirmed empty.\n",
    "\n",
    "Do not use join here; will result in duplicated sequence objects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# location should always be on the sequence level\n",
    "query = '''\n",
    "SELECT TOP 10 seq\n",
    "FROM sequences seq\n",
    "WHERE (SELECT VALUE COUNT(im) FROM im IN seq.images WHERE ARRAY_LENGTH(im.bbox) >= 0) > 0\n",
    "'''\n",
    "\n",
    "partition_key = 'caltech'  # use None if querying across all partitions\n",
    "\n",
    "if partition_key is not None:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      partition_key=partition_key)\n",
    "else:\n",
    "    result_iterable = container_sequences.query_items(query=query,\n",
    "                                                      enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### All images with the specified species at the image level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Length of results: 10\n",
      "CPU times: user 10.8 ms, sys: 2.63 ms, total: 13.4 ms\n",
      "Wall time: 177 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "species_requested = 'horse-tailed squirrel'\n",
    "\n",
    "query ='''\n",
    "SELECT TOP 10 im.class, im.file\n",
    "FROM im IN sequences.images \n",
    "WHERE ARRAY_LENGTH(im.class) > 0 AND ARRAY_CONTAINS(im.class, \"{}\")\n",
    "'''.format(species_requested)\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)\n",
    "\n",
    "print('Length of results:', len(results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Species count where the label is at the sequence level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 7min 48s, sys: 42.3 s, total: 8min 30s\n",
      "Wall time: 2h 51min 52s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT seq.class\n",
    "FROM sequences seq\n",
    "WHERE ARRAY_LENGTH(seq.class) > 0\n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "species_seq = Counter()\n",
    "for item in result_iterable:\n",
    "    res = item['class']\n",
    "    species_seq.update(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "430"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(species_seq)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Species count where the label is at the image level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.17 s, sys: 573 ms, total: 5.75 s\n",
      "Wall time: 1min 55s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 100000 im.class\n",
    "FROM im IN sequences.images \n",
    "WHERE ARRAY_LENGTH(im.class) > 0\n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "species_im = Counter()\n",
    "for item in result_iterable:\n",
    "    res = item['class']\n",
    "    species_im.update(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can increase the throughput of the database when doing large queries to get response faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "species_im"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total number of sequence entries in database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "146526\n",
      "CPU times: user 13 ms, sys: 2.45 ms, total: 15.4 ms\n",
      "Wall time: 175 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT VALUE COUNT(1)\n",
    "FROM seq\n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "for seq_count in result_iterable:\n",
    "    print(seq_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Total number of images in all sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "596773\n",
      "CPU times: user 37.8 ms, sys: 5.76 ms, total: 43.6 ms\n",
      "Wall time: 30.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT VALUE COUNT(1)\n",
    "FROM im IN sequences.images \n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "for im_count in result_iterable:\n",
    "    print(im_count)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### List last inserted sequences based on insertion timestap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 12 ms, sys: 3.14 ms, total: 15.1 ms\n",
      "Wall time: 176 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 10 seq.dataset, seq._ts, seq.seq_id\n",
    "FROM sequences seq\n",
    "ORDER BY seq._ts DESC\n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "results = list(result_iterable)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50869'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50870'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50871'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50872'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50873'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50874'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50875'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50876'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50877'},\n",
       " {'dataset': 'zsl_borneo', '_ts': 1573253030, 'seq_id': '50878'}]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Locations in each dataset\n",
    "\n",
    "Not yet useful."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.24 s, sys: 128 ms, total: 1.36 s\n",
      "Wall time: 25.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "query = '''\n",
    "SELECT TOP 20000 seq.location, seq.dataset\n",
    "FROM sequences seq\n",
    "WHERE seq.location != null\n",
    "'''\n",
    "\n",
    "result_iterable = container_sequences.query_items(query, enable_cross_partition_query=True)\n",
    "\n",
    "results = defaultdict(set)\n",
    "\n",
    "for item in result_iterable:\n",
    "    #if 'location' in item:\n",
    "    results[item['dataset']].add(item['location'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
